home *** CD-ROM | disk | FTP | other *** search
/ Amiga Plus 2004 #11 / Amiga Plus CD - 2004 - No. 11.iso / AmiSoft / Comm / www / tidy_os4.lha / tidy / src / win32tc.c < prev    next >
C/C++ Source or Header  |  2004-07-25  |  36KB  |  785 lines

  1. /* win32tc.c -- Interface to Win32 transcoding routines
  2.  
  3.   (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
  4.   See tidy.h for the copyright notice.
  5.  
  6.   $Id: win32tc.c,v 1.4 2003/04/30 23:32:21 hoehrmann Exp $
  7. */
  8.  
  9. /* keep these here to keep file non-empty */
  10. #include <tidy.h>
  11. #include "forward.h"
  12. #include "streamio.h"
  13. #include "tmbstr.h"
  14. #include "utf8.h"
  15.  
  16. #ifdef TIDY_WIN32_MLANG_SUPPORT
  17.  
  18. #define VC_EXTRALEAN
  19. #define CINTERFACE
  20. #define COBJMACROS
  21.  
  22. #include <windows.h>
  23. #include <mlang.h>
  24.  
  25. #undef COBJMACROS
  26. #undef CINTERFACE
  27. #undef VC_EXTRALEAN
  28.  
  29. /* maximum number of bytes for a single character */
  30. #define TC_INBUFSIZE  16
  31.  
  32. /* maximum number of characters per byte sequence */
  33. #define TC_OUTBUFSIZE 16
  34.  
  35. #define CreateMLangObject(p) \
  36.   CoCreateInstance( \
  37.         &CLSID_CMLangConvertCharset, \
  38.         NULL, \
  39.         CLSCTX_ALL, \
  40.         &IID_IMLangConvertCharset, \
  41.         (VOID **)&p);
  42.  
  43.  
  44. /* Character Set to Microsoft Windows Codepage Identifier map,     */
  45. /* from <rotor/sscli/clr/src/classlibnative/nls/encodingdata.cpp>. */
  46.  
  47. /* note: the 'safe' field indicates whether this encoding can be   */
  48. /* read/written character-by-character; this does not apply to     */
  49. /* various stateful encodings such as ISO-2022 or UTF-7, these     */
  50. /* must be read/written as a complete stream. It is possible that  */
  51. /* some 'unsafe' encodings are marked as 'safe'.                   */
  52.  
  53. /* todo: cleanup; Tidy should use only a single mapping table to   */
  54. /* circumvent unsupported aliases in other transcoding libraries,  */
  55. /* enable reverse lookup of encoding names and ease maintenance.   */
  56.  
  57. static struct _nameWinCPMap
  58. {
  59.     tmbstr name;
  60.     uint wincp;
  61.     Bool safe;
  62. } NameWinCPMap[] = {
  63.   { "cp037",                                            37, yes },
  64.   { "csibm037",                                         37, yes },
  65.   { "ebcdic-cp-ca",                                     37, yes },
  66.   { "ebcdic-cp-nl",                                     37, yes },
  67.   { "ebcdic-cp-us",                                     37, yes },
  68.   { "ebcdic-cp-wt",                                     37, yes },
  69.   { "ibm037",                                           37, yes },
  70.   { "cp437",                                           437, yes },
  71.   { "cspc8codepage437",                                437, yes },
  72.   { "ibm437",                                          437, yes },
  73.   { "cp500",                                           500, yes },
  74.   { "csibm500",                                        500, yes },
  75.   { "ebcdic-cp-be",                                    500, yes },
  76.   { "ebcdic-cp-ch",                                    500, yes },
  77.   { "ibm500",                                          500, yes },
  78.   { "asmo-708",                                        708, yes },
  79.   { "dos-720",                                         720, yes },
  80.   { "ibm737",                                          737, yes },
  81.   { "ibm775",                                          775, yes },
  82.   { "cp850",                                           850, yes },
  83.   { "ibm850",                                          850, yes },
  84.   { "cp852",                                           852, yes },
  85.   { "ibm852",                                          852, yes },
  86.   { "cp855",                                           855, yes },
  87.   { "ibm855",                                          855, yes },
  88.   { "cp857",                                           857, yes },
  89.   { "ibm857",                                          857, yes },
  90.   { "ccsid00858",                                      858, yes },
  91.   { "cp00858",                                         858, yes },
  92.   { "cp858",                                           858, yes },
  93.   { "ibm00858",                                        858, yes },
  94.   { "pc-multilingual-850+euro",                        858, yes },
  95.   { "cp860",                                           860, yes },
  96.   { "ibm860",                                          860, yes },
  97.   { "cp861",                                           861, yes },
  98.   { "ibm861",                                          861, yes },
  99.   { "cp862",                                           862, yes },
  100.   { "dos-862",                                         862, yes },
  101.   { "ibm862",                                          862, yes },
  102.   { "cp863",                                           863, yes },
  103.   { "ibm863",                                          863, yes },
  104.   { "cp864",                                           864, yes },
  105.   { "ibm864",                                          864, yes },
  106.   { "cp865",                                           865, yes },
  107.   { "ibm865",                                          865, yes },
  108.   { "cp866",                                           866, yes },
  109.   { "ibm866",                                          866, yes },
  110.   { "cp869",                                           869, yes },
  111.   { "ibm869",                                          869, yes },
  112.   { "cp870",                                           870, yes },
  113.   { "csibm870",                                        870, yes },
  114.   { "ebcdic-cp-roece",                                 870, yes },
  115.   { "ebcdic-cp-yu",                                    870, yes },
  116.   { "ibm870",                                          870, yes },
  117.   { "dos-874",                                         874, yes },
  118.   { "iso-8859-11",                                     874, yes },
  119.   { "tis-620",                                         874, yes },
  120.   { "windows-874",                                     874, yes },
  121.   { "cp875",                                           875, yes },
  122.   { "csshiftjis",                                      932, yes },
  123.   { "cswindows31j",                                    932, yes },
  124.   { "ms_kanji",                                        932, yes },
  125.   { "shift-jis",                                       932, yes },
  126.   { "shift_jis",                                       932, yes },
  127.   { "sjis",                                            932, yes },
  128.   { "x-ms-cp932",                                      932, yes },
  129.   { "x-sjis",                                          932, yes },
  130.   { "chinese",                                         936, yes },
  131.   { "cn-gb",                                           936, yes },
  132.   { "csgb2312",                                        936, yes },
  133.   { "csgb231280",                                      936, yes },
  134.   { "csiso58gb231280",                                 936, yes },
  135.   { "gb2312",                                          936, yes },
  136.   { "gb2312-80",                                       936, yes },
  137.   { "gb231280",                                        936, yes },
  138.   { "gb_2312-80",                                      936, yes },
  139.   { "gbk",                                             936, yes },
  140.   { "iso-ir-58",                                       936, yes },
  141.   { "csksc56011987",                                   949, yes },
  142.   { "iso-ir-149",                                      949, yes },
  143.   { "korean",                                          949, yes },
  144.   { "ks-c-5601",                                       949, yes },
  145.   { "ks-c5601",                                        949, yes },
  146.   { "ks_c_5601",                                       949, yes },
  147.   { "ks_c_5601-1987",                                  949, yes },
  148.   { "ks_c_5601-1989",                                  949, yes },
  149.   { "ks_c_5601_1987",                                  949, yes },
  150.   { "ksc5601",                                         949, yes },
  151.   { "ksc_5601",                                        949, yes },
  152.   { "big5",                                            950, yes },
  153.   { "big5-hkscs",                                      950, yes },
  154.   { "cn-big5",                                         950, yes },
  155.   { "csbig5",                                          950, yes },
  156.   { "x-x-big5",                                        950, yes },
  157.   { "cp1026",                                         1026, yes },
  158.   { "csibm1026",                                      1026, yes },
  159.   { "ibm1026",                                        1026, yes },
  160.   { "ibm01047",                                       1047, yes },
  161.   { "ccsid01140",                                     1140, yes },
  162.   { "cp01140",                                        1140, yes },
  163.   { "ebcdic-us-37+euro",                              1140, yes },
  164.   { "ibm01140",                                       1140, yes },
  165.   { "ccsid01141",                                     1141, yes },
  166.   { "cp01141",                                        1141, yes },
  167.   { "ebcdic-de-273+euro",                             1141, yes },
  168.   { "ibm01141",                                       1141, yes },
  169.   { "ccsid01142",                                     1142, yes },
  170.   { "cp01142",                                        1142, yes },
  171.   { "ebcdic-dk-277+euro",                             1142, yes },
  172.   { "ebcdic-no-277+euro",                             1142, yes },
  173.   { "ibm01142",                                       1142, yes },
  174.   { "ccsid01143",                                     1143, yes },
  175.   { "cp01143",                                        1143, yes },
  176.   { "ebcdic-fi-278+euro",                             1143, yes },
  177.   { "ebcdic-se-278+euro",                             1143, yes },
  178.   { "ibm01143",                                       1143, yes },
  179.   { "ccsid01144",                                     1144, yes },
  180.   { "cp01144",                                        1144, yes },
  181.   { "ebcdic-it-280+euro",                             1144, yes },
  182.   { "ibm01144",                                       1144, yes },
  183.   { "ccsid01145",                                     1145, yes },
  184.   { "cp01145",                                        1145, yes },
  185.   { "ebcdic-es-284+euro",                             1145, yes },
  186.   { "ibm01145",                                       1145, yes },
  187.   { "ccsid01146",                                     1146, yes },
  188.   { "cp01146",                                        1146, yes },
  189.   { "ebcdic-gb-285+euro",                             1146, yes },
  190.   { "ibm01146",                                       1146, yes },
  191.   { "ccsid01147",                                     1147, yes },
  192.   { "cp01147",                                        1147, yes },
  193.   { "ebcdic-fr-297+euro",                             1147, yes },
  194.   { "ibm01147",                                       1147, yes },
  195.   { "ccsid01148",                                     1148, yes },
  196.   { "cp01148",                                        1148, yes },
  197.   { "ebcdic-international-500+euro",                  1148, yes },
  198.   { "ibm01148",                                       1148, yes },
  199.   { "ccsid01149",                                     1149, yes },
  200.   { "cp01149",                                        1149, yes },
  201.   { "ebcdic-is-871+euro",                             1149, yes },
  202.   { "ibm01149",                                       1149, yes },
  203.   { "iso-10646-ucs-2",                                1200, yes },
  204.   { "ucs-2",                                          1200, yes },
  205.   { "unicode",                                        1200, yes },
  206.   { "utf-16",                                         1200, yes },
  207.   { "utf-16le",                                       1200, yes },
  208.   { "unicodefffe",                                    1201, yes },
  209.   { "utf-16be",                                       1201, yes },
  210.   { "windows-1250",                                   1250, yes },
  211.   { "x-cp1250",                                       1250, yes },
  212.   { "windows-1251",                                   1251, yes },
  213.   { "x-cp1251",                                       1251, yes },
  214.   { "windows-1252",                                   1252, yes },
  215.   { "x-ansi",                                         1252, yes },
  216.   { "windows-1253",                                   1253, yes },
  217.   { "windows-1254",                                   1254, yes },
  218.   { "windows-1255",                                   1255, yes },
  219.   { "cp1256",                                         1256, yes },
  220.   { "windows-1256",                                   1256, yes },
  221.   { "windows-1257",                                   1257, yes },
  222.   { "windows-1258",                                   1258, yes },
  223.   { "johab",                                          1361, yes },
  224.   { "macintosh",                                     10000, yes },
  225.   { "x-mac-japanese",                                10001, yes },
  226.   { "x-mac-chinesetrad",                             10002, yes },
  227.   { "x-mac-korean",                                  10003, yes },
  228.   { "x-mac-arabic",                                  10004, yes },
  229.   { "x-mac-hebrew",                                  10005, yes },
  230.   { "x-mac-greek",                                   10006, yes },
  231.   { "x-mac-cyrillic",                                10007, yes },
  232.   { "x-mac-chinesesimp",                             10008, yes },
  233.   { "x-mac-romanian",                                10010, yes },
  234.   { "x-mac-ukrainian",                               10017, yes },
  235.   { "x-mac-thai",                                    10021, yes },
  236.   { "x-mac-ce",                                      10029, yes },
  237.   { "x-mac-icelandic",                               10079, yes },
  238.   { "x-mac-turkish",                                 10081, yes },
  239.   { "x-mac-croatian",                                10082, yes },
  240.   { "x-chinese-cns",                                 20000, yes },
  241.   { "x-cp20001",                                     20001, yes },
  242.   { "x-chinese-eten",                                20002, yes },
  243.   { "x-cp20003",                                     20003, yes },
  244.   { "x-cp20004",                                     20004, yes },
  245.   { "x-cp20005",                                     20005, yes },
  246.   { "irv",                                           20105, yes },
  247.   { "x-ia5",                                         20105, yes },
  248.   { "din_66003",                                     20106, yes },
  249.   { "german",                                        20106, yes },
  250.   { "x-ia5-german",                                  20106, yes },
  251.   { "sen_850200_b",                                  20107, yes },
  252.   { "swedish",                                       20107, yes },
  253.   { "x-ia5-swedish",                                 20107, yes },
  254.   { "norwegian",                                     20108, yes },
  255.   { "ns_4551-1",                                     20108, yes },
  256.   { "x-ia5-norwegian",                               20108, yes },
  257.   { "ansi_x3.4-1968",                                20127, yes },
  258.   { "ansi_x3.4-1986",                                20127, yes },
  259.   { "ascii",                                         20127, yes },
  260.   { "cp367",                                         20127, yes },
  261.   { "csascii",                                       20127, yes },
  262.   { "ibm367",                                        20127, yes },
  263.   { "iso-ir-6",                                      20127, yes },
  264.   { "iso646-us",                                     20127, yes },
  265.   { "iso_646.irv:1991",                              20127, yes },
  266.   { "us",                                            20127, yes },
  267.   { "us-ascii",                                      20127, yes },
  268.   { "x-cp20261",                                     20261, yes },
  269.   { "x-cp20269",                                     20269, yes },
  270.   { "cp273",                                         20273, yes },
  271.   { "csibm273",                                      20273, yes },
  272.   { "ibm273",                                        20273, yes },
  273.   { "csibm277",                                      20277, yes },
  274.   { "ebcdic-cp-dk",                                  20277, yes },
  275.   { "ebcdic-cp-no",                                  20277, yes },
  276.   { "ibm277",                                        20277, yes },
  277.   { "cp278",                                         20278, yes },
  278.   { "csibm278",                                      20278, yes },
  279.   { "ebcdic-cp-fi",                                  20278, yes },
  280.   { "ebcdic-cp-se",                                  20278, yes },
  281.   { "ibm278",                                        20278, yes },
  282.   { "cp280",                                         20280, yes },
  283.   { "csibm280",                                      20280, yes },
  284.   { "ebcdic-cp-it",                                  20280, yes },
  285.   { "ibm280",                                        20280, yes },
  286.   { "cp284",                                         20284, yes },
  287.   { "csibm284",                                      20284, yes },
  288.   { "ebcdic-cp-es",                                  20284, yes },
  289.   { "ibm284",                                        20284, yes },
  290.   { "cp285",                                         20285, yes },
  291.   { "csibm285",                                      20285, yes },
  292.   { "ebcdic-cp-gb",                                  20285, yes },
  293.   { "ibm285",                                        20285, yes },
  294.   { "cp290",                                         20290, yes },
  295.   { "csibm290",                                      20290, yes },
  296.   { "ebcdic-jp-kana",                                20290, yes },
  297.   { "ibm290",                                        20290, yes },
  298.   { "cp297",                                         20297, yes },
  299.   { "csibm297",                                      20297, yes },
  300.   { "ebcdic-cp-fr",                                  20297, yes },
  301.   { "ibm297",                                        20297, yes },
  302.   { "cp420",                                         20420, yes },
  303.   { "csibm420",                                      20420, yes },
  304.   { "ebcdic-cp-ar1",                                 20420, yes },
  305.   { "ibm420",                                        20420, yes },
  306.   { "cp423",                                         20423, yes },
  307.   { "csibm423",                                      20423, yes },
  308.   { "ebcdic-cp-gr",                                  20423, yes },
  309.   { "ibm423",                                        20423, yes },
  310.   { "cp424",                                         20424, yes },
  311.   { "csibm424",                                      20424, yes },
  312.   { "ebcdic-cp-he",                                  20424, yes },
  313.   { "ibm424",                                        20424, yes },
  314.   { "x-ebcdic-koreanextended",                       20833, yes },
  315.   { "csibmthai",                                     20838, yes },
  316.   { "ibm-thai",                                      20838, yes },
  317.   { "cskoi8r",                                       20866, yes },
  318.   { "koi",                                           20866, yes },
  319.   { "koi8",                                          20866, yes },
  320.   { "koi8-r",                                        20866, yes },
  321.   { "koi8r",                                         20866, yes },
  322.   { "cp871",                                         20871, yes },
  323.   { "csibm871",                                      20871, yes },
  324.   { "ebcdic-cp-is",                                  20871, yes },
  325.   { "ibm871",                                        20871, yes },
  326.   { "cp880",                                         20880, yes },
  327.   { "csibm880",                                      20880, yes },
  328.   { "ebcdic-cyrillic",                               20880, yes },
  329.   { "ibm880",                                        20880, yes },
  330.   { "cp905",                                         20905, yes },
  331.   { "csibm905",                                      20905, yes },
  332.   { "ebcdic-cp-tr",                                  20905, yes },
  333.   { "ibm905",                                        20905, yes },
  334.   { "ccsid00924",                                    20924, yes },
  335.   { "cp00924",                                       20924, yes },
  336.   { "ebcdic-latin9--euro",                           20924, yes },
  337.   { "ibm00924",                                      20924, yes },
  338.   { "x-cp20936",                                     20936, yes },
  339.   { "x-cp20949",                                     20949, yes },
  340.   { "cp1025",                                        21025, yes },
  341.   { "x-cp21027",                                     21027, yes },
  342.   { "koi8-ru",                                       21866, yes },
  343.   { "koi8-u",                                        21866, yes },
  344.   { "cp819",                                         28591, yes },
  345.   { "csisolatin1",                                   28591, yes },
  346.   { "ibm819",                                        28591, yes },
  347.   { "iso-8859-1",                                    28591, yes },
  348.   { "iso-ir-100",                                    28591, yes },
  349.   { "iso8859-1",                                     28591, yes },
  350.   { "iso_8859-1",                                    28591, yes },
  351.   { "iso_8859-1:1987",                               28591, yes },
  352.   { "l1",                                            28591, yes },
  353.   { "latin1",                                        28591, yes },
  354.   { "csisolatin2",                                   28592, yes },
  355.   { "iso-8859-2",                                    28592, yes },
  356.   { "iso-ir-101",                                    28592, yes },
  357.   { "iso8859-2",                                     28592, yes },
  358.   { "iso_8859-2",                                    28592, yes },
  359.   { "iso_8859-2:1987",                               28592, yes },
  360.   { "l2",                                            28592, yes },
  361.   { "latin2",                                        28592, yes },
  362.   { "csisolatin3",                                   28593, yes },
  363.   { "iso-8859-3",                                    28593, yes },
  364.   { "iso-ir-109",                                    28593, yes },
  365.   { "iso_8859-3",                                    28593, yes },
  366.   { "iso_8859-3:1988",                               28593, yes },
  367.   { "l3",                                            28593, yes },
  368.   { "latin3",                                        28593, yes },
  369.   { "csisolatin4",                                   28594, yes },
  370.   { "iso-8859-4",                                    28594, yes },
  371.   { "iso-ir-110",                                    28594, yes },
  372.   { "iso_8859-4",                                    28594, yes },
  373.   { "iso_8859-4:1988",                               28594, yes },
  374.   { "l4",                                            28594, yes },
  375.   { "latin4",                                        28594, yes },
  376.   { "csisolatincyrillic",                            28595, yes },
  377.   { "cyrillic",                                      28595, yes },
  378.   { "iso-8859-5",                                    28595, yes },
  379.   { "iso-ir-144",                                    28595, yes },
  380.   { "iso_8859-5",                                    28595, yes },
  381.   { "iso_8859-5:1988",                               28595, yes },
  382.   { "arabic",                                        28596, yes },
  383.   { "csisolatinarabic",                              28596, yes },
  384.   { "ecma-114",                                      28596, yes },
  385.   { "iso-8859-6",                                    28596, yes },
  386.   { "iso-ir-127",                                    28596, yes },
  387.   { "iso_8859-6",                                    28596, yes },
  388.   { "iso_8859-6:1987",                               28596, yes },
  389.   { "csisolatingreek",                               28597, yes },
  390.   { "ecma-118",                                      28597, yes },
  391.   { "elot_928",                                      28597, yes },
  392.   { "greek",                                         28597, yes },
  393.   { "greek8",                                        28597, yes },
  394.   { "iso-8859-7",                                    28597, yes },
  395.   { "iso-ir-126",                                    28597, yes },
  396.   { "iso_8859-7",                                    28597, yes },
  397.   { "iso_8859-7:1987",                               28597, yes },
  398.   { "csisolatinhebrew",                              28598, yes },
  399.   { "hebrew",                                        28598, yes },
  400.   { "iso-8859-8",                                    28598, yes },
  401.   { "iso-ir-138",                                    28598, yes },
  402.   { "iso_8859-8",                                    28598, yes },
  403.   { "iso_8859-8:1988",                               28598, yes },
  404.   { "logical",                                       28598, yes },
  405.   { "visual",                                        28598, yes },
  406.   { "csisolatin5",                                   28599, yes },
  407.   { "iso-8859-9",                                    28599, yes },
  408.   { "iso-ir-148",                                    28599, yes },
  409.   { "iso_8859-9",                                    28599, yes },
  410.   { "iso_8859-9:1989",                               28599, yes },
  411.   { "l5",                                            28599, yes },
  412.   { "latin5",                                        28599, yes },
  413.   { "iso-8859-13",                                   28603, yes },
  414.   { "csisolatin9",                                   28605, yes },
  415.   { "iso-8859-15",                                   28605, yes },
  416.   { "iso_8859-15",                                   28605, yes },
  417.   { "l9",                                            28605, yes },
  418.   { "latin9",                                        28605, yes },
  419.   { "x-europa",                                      29001, yes },
  420.   { "iso-8859-8-i",                                  38598, yes },
  421.   { "iso-2022-jp",                                   50220,  no },
  422.   { "csiso2022jp",                                   50221,  no },
  423.   { "csiso2022kr",                                   50225,  no },
  424.   { "iso-2022-kr",                                   50225,  no },
  425.   { "iso-2022-kr-7",                                 50225,  no },
  426.   { "iso-2022-kr-7bit",                              50225,  no },
  427.   { "cp50227",                                       50227,  no },
  428.   { "x-cp50227",                                     50227,  no },
  429.   { "cp930",                                         50930, yes },
  430.   { "x-ebcdic-japaneseanduscanada",                  50931, yes },
  431.   { "cp933",                                         50933, yes },
  432.   { "cp935",                                         50935, yes },
  433.   { "cp937",                                         50937, yes },
  434.   { "cp939",                                         50939, yes },
  435.   { "cseucpkdfmtjapanese",                           51932, yes },
  436.   { "euc-jp",                                        51932, yes },
  437.   { "extended_unix_code_packed_format_for_japanese", 51932, yes },
  438.   { "iso-2022-jpeuc",                                51932, yes },
  439.   { "x-euc",                                         51932, yes },
  440.   { "x-euc-jp",                                      51932, yes },
  441.   { "euc-cn",                                        51936, yes },
  442.   { "x-euc-cn",                                      51936, yes },
  443.   { "cseuckr",                                       51949, yes },
  444.   { "euc-kr",                                        51949, yes },
  445.   { "iso-2022-kr-8",                                 51949, yes },
  446.   { "iso-2022-kr-8bit",                              51949, yes },
  447.   { "hz-gb-2312",                                    52936,  no },
  448.   { "gb18030",                                       54936, yes },
  449.   { "x-iscii-de",                                    57002, yes },
  450.   { "x-iscii-be",                                    57003, yes },
  451.   { "x-iscii-ta",                                    57004, yes },
  452.   { "x-iscii-te",                                    57005, yes },
  453.   { "x-iscii-as",                                    57006, yes },
  454.   { "x-iscii-or",                                    57007, yes },
  455.   { "x-iscii-ka",                                    57008, yes },
  456.   { "x-iscii-ma",                                    57009, yes },
  457.   { "x-iscii-gu",                                    57010, yes },
  458.   { "x-iscii-pa",                                    57011, yes },
  459.   { "csunicode11utf7",                               65000,  no },
  460.   { "unicode-1-1-utf-7",                             65000,  no },
  461.   { "unicode-2-0-utf-7",                             65000,  no },
  462.   { "utf-7",                                         65000,  no },
  463.   { "x-unicode-1-1-utf-7",                           65000,  no },
  464.   { "x-unicode-2-0-utf-7",                           65000,  no },
  465.   { "unicode-1-1-utf-8",                             65001, yes },
  466.   { "unicode-2-0-utf-8",                             65001, yes },
  467.   { "utf-8",                                         65001, yes },
  468.   { "x-unicode-1-1-utf-8",                           65001, yes },
  469.   { "x-unicode-2-0-utf-8",                           65001, yes },
  470.  
  471.   /* final entry */
  472.   { NULL,                                                0,  no }
  473. };
  474.  
  475. uint Win32MLangGetCPFromName(ctmbstr encoding)
  476. {
  477.     uint i;
  478.     tmbstr enc;
  479.  
  480.     /* ensure name is in lower case */
  481.     enc = tmbstrdup(encoding);
  482.     enc = tmbstrtolower(enc);
  483.  
  484.     for (i = 0; NameWinCPMap[i].name; ++i)
  485.     {
  486.         if (tmbstrcmp(NameWinCPMap[i].name, enc) == 0)
  487.         {
  488.             IMLangConvertCharset * p = NULL;
  489.             uint wincp = NameWinCPMap[i].wincp;
  490.             HRESULT hr;
  491.  
  492.             MemFree(enc);
  493.  
  494.             /* currently no support for unsafe encodings */
  495.             if (!NameWinCPMap[i].safe)
  496.                 return 0;
  497.  
  498.             /* hack for config.c */
  499.             CoInitialize(NULL);
  500.             hr = CreateMLangObject(p);
  501.  
  502.             if (hr != S_OK || !p)
  503.             {
  504.                 wincp = 0;
  505.             }
  506.             else
  507.             {
  508.                 hr = IMLangConvertCharset_Initialize(p, wincp, 1200, 0);
  509.  
  510.                 if (hr != S_OK)
  511.                     wincp = 0;
  512.  
  513.                 IMLangConvertCharset_Release(p);
  514.                 p = NULL;
  515.             }
  516.  
  517.             CoUninitialize();
  518.  
  519.             return wincp;
  520.         }
  521.     }
  522.  
  523.     MemFree(enc);
  524.     return 0;
  525. }
  526.  
  527. Bool Win32MLangInitInputTranscoder(StreamIn * in, uint wincp)
  528. {
  529.     IMLangConvertCharset * p = NULL;
  530.     HRESULT hr;
  531.  
  532.     assert( in != NULL );
  533.  
  534.     CoInitialize(NULL);
  535.  
  536.     if (wincp == 0)
  537.     {
  538.         /* no codepage found for this encoding */
  539.         return no;
  540.     }
  541.  
  542.     hr = CreateMLangObject(p);
  543.  
  544.     if (hr != S_OK || !p)
  545.     {
  546.         /* MLang not supported */
  547.         return no;
  548.     }
  549.  
  550.     hr = IMLangConvertCharset_Initialize(p, wincp, 1200, 0);
  551.  
  552.     if (hr != S_OK)
  553.     {
  554.         /* encoding not supported, insufficient memory, etc. */
  555.         return no;
  556.     }
  557.  
  558.     in->mlang = (ulong)p;
  559.  
  560.     return yes;
  561. }
  562.  
  563. void Win32MLangUninitInputTranscoder(StreamIn * in)
  564. {
  565.     IMLangConvertCharset * p;
  566.  
  567.     assert( in != NULL );
  568.  
  569.     p = (IMLangConvertCharset *)in->mlang;
  570.     if (p)
  571.     {
  572.         IMLangConvertCharset_Release(p);
  573.         p = NULL;
  574.         in->mlang = (ulong)NULL;
  575.     }
  576.  
  577.     CoUninitialize();
  578. }
  579.  
  580. Bool Win32MLangInitOutputTranscoder(StreamOut * out, tmbstr encoding)
  581. {
  582.     IMLangConvertCharset * p = NULL;
  583.     HRESULT hr;
  584.     uint wincp;
  585.  
  586.     assert( out != NULL );
  587.  
  588.     CoInitialize(NULL);
  589.  
  590.     wincp = Win32MLangGetCPFromName(encoding);
  591.     if (wincp == 0)
  592.     {
  593.         /* no codepage found for this encoding */
  594.         return no;
  595.     }
  596.  
  597.     hr = CreateMLangObject(p);
  598.  
  599.     if (hr != S_OK || !p)
  600.     {
  601.         /* MLang not supported */
  602.         return no;
  603.     }
  604.  
  605.     IMLangConvertCharset_Initialize(p, 1200, wincp, MLCONVCHARF_NOBESTFITCHARS);
  606.  
  607.     if (hr != S_OK)
  608.     {
  609.         /* encoding not supported, insufficient memory, etc. */
  610.         return no;
  611.     }
  612.  
  613.     out->mlang = (ulong)p;
  614.  
  615.     return yes;
  616. }
  617.  
  618. void Win32MLangUninitOutputTranscoder(StreamOut * out)
  619. {
  620.     IMLangConvertCharset * p;
  621.  
  622.     assert( out != NULL );
  623.  
  624.     p = (IMLangConvertCharset *)out->mlang;
  625.     if (p)
  626.     {
  627.         IMLangConvertCharset_Release(p);
  628.         p = NULL;
  629.         out->mlang = (ulong)NULL;
  630.     }
  631.  
  632.     CoUninitialize();
  633. }
  634.  
  635. int Win32MLangGetChar(byte firstByte, StreamIn * in, uint * bytesRead)
  636. {
  637.     IMLangConvertCharset * p;
  638.     TidyInputSource * source;
  639.     CHAR inbuf[TC_INBUFSIZE] = { 0 };
  640.     WCHAR outbuf[TC_OUTBUFSIZE] = { 0 };
  641.     HRESULT hr = S_OK;
  642.     size_t inbufsize = 0;
  643.  
  644.     assert( in != NULL );
  645.     assert( &in->source != NULL );
  646.     assert( bytesRead != NULL );
  647.     assert( in->mlang != 0 );
  648.  
  649.     p = (IMLangConvertCharset *)in->mlang;
  650.     source = &in->source;
  651.  
  652.     inbuf[inbufsize++] = (CHAR)firstByte;
  653.  
  654.     while(inbufsize < TC_INBUFSIZE)
  655.     {
  656.         UINT outbufsize = TC_OUTBUFSIZE;
  657.         UINT readNow = inbufsize;
  658.         int nextByte = EndOfStream;
  659.  
  660.         hr = IMLangConvertCharset_DoConversionToUnicode(p, inbuf, &readNow, outbuf, &outbufsize);
  661.  
  662.         assert( hr == S_OK );
  663.         assert( outbufsize <= 2 );
  664.  
  665.         if (outbufsize == 2)
  666.         {
  667.             /* U+10000-U+10FFFF are returned as a pair of surrogates */
  668.             tchar m = (tchar)outbuf[0];
  669.             tchar n = (tchar)outbuf[1];
  670.             assert( IsHighSurrogate(n) && IsLowSurrogate(m) );
  671.             *bytesRead = readNow;
  672.             return (int)CombineSurrogatePair(n, m);
  673.         }
  674.  
  675.         if (outbufsize == 1)
  676.         {
  677.             /* we found the character   */
  678.             /* set bytesRead and return */
  679.             *bytesRead = readNow;
  680.             return (int)outbuf[0];
  681.         }
  682.  
  683.         /* we need more bytes */
  684.         nextByte = source->getByte(source->sourceData);
  685.  
  686.         if (nextByte == EndOfStream)
  687.         {
  688.             /* todo: error message for broken stream? */
  689.  
  690.             *bytesRead = readNow;
  691.             return EndOfStream;
  692.         }
  693.  
  694.         inbuf[inbufsize++] = (CHAR)nextByte;
  695.     }
  696.  
  697.     /* No full character found after reading TC_INBUFSIZE bytes, */
  698.     /* give up to read this stream, it's obviously unreadable.   */
  699.  
  700.     /* todo: error message for broken stream? */
  701.     return EndOfStream;
  702. }
  703.  
  704. Bool Win32MLangIsConvertible(tchar c, StreamOut * out)
  705. {
  706.     IMLangConvertCharset * p;
  707.     UINT i = 1;
  708.     HRESULT hr;
  709.     WCHAR inbuf[2] = { 0 };
  710.     UINT inbufsize = 0;
  711.  
  712.     assert( c != 0 );
  713.     assert( c <= 0x10FFFF );
  714.     assert( out != NULL );
  715.     assert( out->mlang != 0 );
  716.  
  717.     if (c > 0xFFFF)
  718.     {
  719.         tchar high = 0;
  720.         tchar low = 0;
  721.  
  722.         SplitSurrogatePair(c, &low, &high);
  723.  
  724.         inbuf[inbufsize++] = (WCHAR)low;
  725.         inbuf[inbufsize++] = (WCHAR)high;
  726.     }
  727.     else
  728.         inbuf[inbufsize++] = (WCHAR)c;
  729.  
  730.     p = (IMLangConvertCharset *)out->mlang;
  731.     hr = IMLangConvertCharset_DoConversionFromUnicode(p, inbuf, &inbufsize, NULL, NULL);
  732.  
  733.     return hr == S_OK ? yes : no;
  734. }
  735.  
  736. void Win32MLangPutChar(tchar c, StreamOut * out, uint * bytesWritten)
  737. {
  738.     IMLangConvertCharset * p;
  739.     TidyOutputSink * sink;
  740.     CHAR outbuf[TC_OUTBUFSIZE] = { 0 };
  741.     UINT outbufsize = TC_OUTBUFSIZE;
  742.     HRESULT hr = S_OK;
  743.     WCHAR inbuf[2] = { 0 };
  744.     UINT inbufsize = 0;
  745.     uint i;
  746.  
  747.     assert( c != 0 );
  748.     assert( c <= 0x10FFFF );
  749.     assert( bytesWritten != NULL );
  750.     assert( out != NULL );
  751.     assert( &out->sink != NULL );
  752.     assert( out->mlang != 0 );
  753.  
  754.     p = (IMLangConvertCharset *)out->mlang;
  755.     sink = &out->sink;
  756.  
  757.     if (c > 0xFFFF)
  758.     {
  759.         tchar high = 0;
  760.         tchar low = 0;
  761.  
  762.         SplitSurrogatePair(c, &low, &high);
  763.  
  764.         inbuf[inbufsize++] = (WCHAR)low;
  765.         inbuf[inbufsize++] = (WCHAR)high;
  766.     }
  767.     else
  768.         inbuf[inbufsize++] = (WCHAR)c;
  769.  
  770.     hr = IMLangConvertCharset_DoConversionFromUnicode(p, inbuf, &inbufsize, outbuf, &outbufsize);
  771.     
  772.     assert( hr == S_OK );
  773.     assert( outbufsize > 0 );
  774.     assert( inbufsize == 1 || inbufsize == 2 );
  775.  
  776.     for (i = 0; i < outbufsize; ++i)
  777.         sink->putByte(sink->sinkData, (byte)(outbuf[i]));
  778.  
  779.     *bytesWritten = outbufsize;
  780.  
  781.     return;
  782. }
  783.  
  784. #endif /* TIDY_WIN32_MLANG_SUPPORT */
  785.